import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
# Notebook-wide plotting defaults: wide 15x5 figures on seaborn's dark grid.
rcParams['figure.figsize'] = (15, 5)
sns.set_style(style='darkgrid')

# Load the dataset from the working directory and preview the first rows.
new_breast_cancer_df = pd.read_csv(filepath_or_buffer='data.csv')
new_breast_cancer_df.head(n=5)
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
# Remove the 'Unnamed: 32' column (NaN in every previewed row) and the 'id'
# column, then discard any rows that still contain missing values.
new_breast_cancer_df = new_breast_cancer_df.drop(columns=['Unnamed: 32', 'id'])
new_breast_cancer_df = new_breast_cancer_df.dropna()
# Print column dtypes and non-null counts to confirm the cleanup.
new_breast_cancer_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 diagnosis 569 non-null object 1 radius_mean 569 non-null float64 2 texture_mean 569 non-null float64 3 perimeter_mean 569 non-null float64 4 area_mean 569 non-null float64 5 smoothness_mean 569 non-null float64 6 compactness_mean 569 non-null float64 7 concavity_mean 569 non-null float64 8 concave points_mean 569 non-null float64 9 symmetry_mean 569 non-null float64 10 fractal_dimension_mean 569 non-null float64 11 radius_se 569 non-null float64 12 texture_se 569 non-null float64 13 perimeter_se 569 non-null float64 14 area_se 569 non-null float64 15 smoothness_se 569 non-null float64 16 compactness_se 569 non-null float64 17 concavity_se 569 non-null float64 18 concave points_se 569 non-null float64 19 symmetry_se 569 non-null float64 20 fractal_dimension_se 569 non-null float64 21 radius_worst 569 non-null float64 22 texture_worst 569 non-null float64 23 perimeter_worst 569 non-null float64 24 area_worst 569 non-null float64 25 smoothness_worst 569 non-null float64 26 compactness_worst 569 non-null float64 27 concavity_worst 569 non-null float64 28 concave points_worst 569 non-null float64 29 symmetry_worst 569 non-null float64 30 fractal_dimension_worst 569 non-null float64 dtypes: float64(30), object(1) memory usage: 137.9+ KB
# Pairwise scatter/distribution grid over the ten "*_mean" features,
# coloured by diagnosis.
mean_feature_columns = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]
sns.pairplot(data=new_breast_cancer_df, vars=mean_feature_columns, hue='diagnosis')
plt.show()
/Users/nicholastran/anaconda3/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Class balance: number of samples per diagnosis label.
sns.countplot(data=new_breast_cancer_df, x='diagnosis')
plt.show()

# Mean area against mean smoothness, coloured by diagnosis.
sns.scatterplot(
    data=new_breast_cancer_df,
    x='area_mean',
    y='smoothness_mean',
    hue='diagnosis',
)
plt.show()
# Correlation heatmap of every feature plus the diagnosis label.
plt.figure(figsize=(20, 10))
# Work on a copy so the encoding below never touches the modelling frame.
new_breast_cancer_df_hmap = new_breast_cancer_df.copy()
# Encode the label numerically (B -> 0, M -> 1) before computing correlations.
# Coercing the 'B'/'M' strings with pd.to_numeric would turn the whole column
# into NaN and leave an empty diagnosis row/column in the heatmap.
new_breast_cancer_df_hmap['diagnosis'] = new_breast_cancer_df_hmap['diagnosis'].map({'B': 0, 'M': 1})
# Any other non-numeric columns are still coerced; unparseable values become NaN.
non_numeric_columns = new_breast_cancer_df_hmap.select_dtypes(exclude=['number']).columns
if not non_numeric_columns.empty:
    new_breast_cancer_df_hmap[non_numeric_columns] = new_breast_cancer_df_hmap[non_numeric_columns].apply(pd.to_numeric, errors='coerce')
sns.heatmap(new_breast_cancer_df_hmap.corr(), annot=True)
plt.show()
from sklearn.preprocessing import StandardScaler

# Split the frame into the feature matrix and the diagnosis label.
X = new_breast_cancer_df.drop('diagnosis', axis=1)
y = new_breast_cancer_df['diagnosis']

# Standardise every feature column.
# NOTE(review): the scaler is fit on all rows here, before the train/test
# split that follows later in the file, so test-set statistics influence the
# scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rebuild a DataFrame from the scaled array, keeping the original row index so
# X stays aligned with y. Label-based lookups such as
# X_train.loc[y_train.index] depend on the two sharing the same index.
X = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)
X.head()
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.097064 | -2.073335 | 1.269934 | 0.984375 | 1.568466 | 3.283515 | 2.652874 | 2.532475 | 2.217515 | 2.255747 | ... | 1.886690 | -1.359293 | 2.303601 | 2.001237 | 1.307686 | 2.616665 | 2.109526 | 2.296076 | 2.750622 | 1.937015 |
| 1 | 1.829821 | -0.353632 | 1.685955 | 1.908708 | -0.826962 | -0.487072 | -0.023846 | 0.548144 | 0.001392 | -0.868652 | ... | 1.805927 | -0.369203 | 1.535126 | 1.890489 | -0.375612 | -0.430444 | -0.146749 | 1.087084 | -0.243890 | 0.281190 |
| 2 | 1.579888 | 0.456187 | 1.566503 | 1.558884 | 0.942210 | 1.052926 | 1.363478 | 2.037231 | 0.939685 | -0.398008 | ... | 1.511870 | -0.023974 | 1.347475 | 1.456285 | 0.527407 | 1.082932 | 0.854974 | 1.955000 | 1.152255 | 0.201391 |
| 3 | -0.768909 | 0.253732 | -0.592687 | -0.764464 | 3.283553 | 3.402909 | 1.915897 | 1.451707 | 2.867383 | 4.910919 | ... | -0.281464 | 0.133984 | -0.249939 | -0.550021 | 3.394275 | 3.893397 | 1.989588 | 2.175786 | 6.046041 | 4.935010 |
| 4 | 1.750297 | -1.151816 | 1.776573 | 1.826229 | 0.280372 | 0.539340 | 1.371011 | 1.428493 | -0.009560 | -0.562450 | ... | 1.298575 | -1.466770 | 1.338539 | 1.220724 | 0.220556 | -0.313395 | 0.613179 | 0.729259 | -0.868353 | -0.397100 |
5 rows × 30 columns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Drop training labels that are missing and keep only the matching feature rows.
y_train = y_train.dropna()
X_train = X_train.loc[y_train.index]

# Baseline support-vector classifier with default hyperparameters.
model = SVC()
model.fit(X_train, y_train)
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC()
predictions = model.predict(X_test)
from sklearn.metrics import confusion_matrix

# Pin the label order so that row/column 0 is benign ('B') and row/column 1 is
# malignant ('M'). Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, predictions, labels=['B', 'M'])
# Malignant is treated as the positive class, so for the 2x2 matrix:
#   cm[0, 0] = TN, cm[0, 1] = FP, cm[1, 0] = FN, cm[1, 1] = TP
tn, fp, fn, tp = cm.ravel()
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', tp)
print('\nTrue Negatives(TN) = ', tn)
print('\nFalse Positives(FP) = ', fp)
print('\nFalse Negatives(FN) = ', fn)
Confusion matrix [[104 1] [ 3 63]] True Positives(TP) = 104 True Negatives(TN) = 63 False Positives(FP) = 1 False Negatives(FN) = 3
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the baseline SVC on the test split.
baseline_report = classification_report(y_true=y_test, y_pred=predictions)
print(baseline_report)
precision recall f1-score support
B 0.97 0.99 0.98 105
M 0.98 0.95 0.97 66
accuracy 0.98 171
macro avg 0.98 0.97 0.98 171
weighted avg 0.98 0.98 0.98 171
from sklearn.model_selection import GridSearchCV

# Search space for the RBF-kernel SVC: 5 values of C x 5 values of gamma.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
# refit=True retrains the best combination on all of the training data;
# verbose=3 logs the score of every individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)
Fitting 5 folds for each of 25 candidates, totalling 125 fits [CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.925 total time= 0.0s [CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.950 total time= 0.0s [CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.900 total time= 0.0s [CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.962 total time= 0.0s [CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.949 total time= 0.0s [CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.912 total time= 0.0s [CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.962 total time= 0.0s [CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.688 total time= 0.0s [CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.688 total time= 0.0s [CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.688 total time= 0.0s [CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.684 total time= 0.0s [CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.709 total time= 0.0s [CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ...C=0.1, gamma=0.0001, 
kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.625 total time= 0.0s [CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.950 total time= 0.0s [CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.963 total time= 0.0s [CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.975 total time= 0.0s [CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.987 total time= 0.0s [CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.988 total time= 0.0s [CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.988 total time= 0.0s [CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.912 total time= 0.0s [CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.962 total time= 0.0s [CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.688 total time= 0.0s [CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.700 total time= 0.0s [CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.700 total time= 0.0s [CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.696 total time= 0.0s [CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=0.709 
total time= 0.0s [CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.963 total time= 0.0s [CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.987 total time= 0.0s [CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.963 total time= 0.0s [CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.0s [CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=0.987 total time= 0.0s [CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.988 total time= 0.0s [CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.0s [CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.912 total time= 0.0s [CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=0.949 total time= 0.0s [CV 
1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.975 total time= 0.0s [CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.987 total time= 0.0s [CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.938 total time= 0.0s [CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.949 total time= 0.0s [CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.0s [CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=0.987 total time= 0.0s [CV 1/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.988 total time= 0.0s [CV 3/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.0s [CV 4/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.987 total time= 0.0s [CV 5/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 1/5] END .......C=1000, 
gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.637 total time= 0.0s [CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.633 total time= 0.0s [CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.975 total time= 0.0s [CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.987 total time= 0.0s [CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.950 total time= 0.0s [CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.950 total time= 0.0s [CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.937 total time= 0.0s [CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.975 total time= 0.0s [CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.963 total time= 0.0s [CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.949 total time= 0.0s [CV 5/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.975 total time= 0.0s [CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.950 total time= 0.0s [CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.0s [CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.975 total time= 0.0s [CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.987 total time= 0.0s
GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']},
verbose=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(),
param_grid={'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']},
verbose=3)SVC()
SVC()
# Show the hyperparameter combination selected by the grid search.
grid.best_params_
{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
# Evaluate the refit best estimator from the grid search on the held-out test set.
grid_predictions = grid.predict(X_test)
tuned_confusion = confusion_matrix(y_true=y_test, y_pred=grid_predictions)
tuned_report = classification_report(y_true=y_test, y_pred=grid_predictions)
print(tuned_confusion)
print(tuned_report)
[[105 0]
[ 2 64]]
precision recall f1-score support
B 0.98 1.00 0.99 105
M 1.00 0.97 0.98 66
accuracy 0.99 171
macro avg 0.99 0.98 0.99 171
weighted avg 0.99 0.99 0.99 171